In [1]:
#package
import pandas as pd
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statistics import median
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [2]:
# Load data
path = 'D:/USA 2021 Spring/Data_analysis/train.csv'
traindf = pd.read_csv(path)
In [3]:
path = 'D:/USA 2021 Spring/Data_analysis/users.csv'
userdf = pd.read_csv(path)
In [4]:
path = 'D:/USA 2021 Spring/Data_analysis/test.csv'
testdf = pd.read_csv(path)
In [5]:
traindf = pd.merge(traindf, userdf, on = 'user_id')
testdf = pd.merge(testdf, userdf, on = 'user_id')
display(traindf.head(), testdf.head())
country_code grass_date user_id subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days ... checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days open_flag row_id attr_1 attr_2 attr_3 age domain
0 4 2019-07-16 00:00:00+08:00 43 44 19 6 18 0 2 4 ... 0 5 10 0 0 1.0 1.0 2.0 47.0 @gmail.com
1 4 2019-07-16 00:00:00+08:00 102 44 9 4 8 2 9 17 ... 1 1 4 1 1 1.0 1.0 2.0 25.0 @hotmail.com
2 6 2019-07-16 00:00:00+08:00 177 49 14 5 5 0 4 12 ... 5 19 27 0 2 NaN 1.0 0.0 NaN @yahoo.com
3 1 2019-07-16 00:00:00+08:00 184 49 49 9 53 0 0 1 ... 1 3 6 0 3 1.0 1.0 2.0 24.0 @yahoo.com
4 6 2019-07-16 00:00:00+08:00 221 49 227 6 221 0 0 0 ... 0 0 0 0 4 NaN 1.0 0.0 NaN @hotmail.com

5 rows × 23 columns

country_code grass_date user_id subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days ... login_count_last_60_days checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days row_id attr_1 attr_2 attr_3 age domain
0 6 2019-09-03 00:00:00+08:00 0 35 27 2 13 2 3 4 ... 134 0 6 18 0 NaN 1.0 0.0 NaN @gmail.com
1 6 2019-09-03 00:00:00+08:00 130 35 7 5 383 1 1 1 ... 5 0 0 0 1 NaN 1.0 0.0 NaN @gmail.com
2 5 2019-09-03 00:00:00+08:00 150 25 34 1 3 0 0 0 ... 38 2 2 2 2 1.0 1.0 2.0 33.0 @gmail.com
3 1 2019-09-03 00:00:00+08:00 181 36 63 5 5 0 0 0 ... 173 2 5 5 3 1.0 1.0 2.0 22.0 @yahoo.com
4 5 2019-09-03 00:00:00+08:00 192 23 6 5 54 0 0 0 ... 39 0 0 2 4 NaN 1.0 0.0 NaN @gmail.com

5 rows × 22 columns

Step 1 Exploration and Data Clean

Data information

In [6]:
traindf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73539 entries, 0 to 73538
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   country_code                 73539 non-null  int64  
 1   grass_date                   73539 non-null  object 
 2   user_id                      73539 non-null  int64  
 3   subject_line_length          73539 non-null  int64  
 4   last_open_day                73539 non-null  object 
 5   last_login_day               73539 non-null  object 
 6   last_checkout_day            73539 non-null  object 
 7   open_count_last_10_days      73539 non-null  int64  
 8   open_count_last_30_days      73539 non-null  int64  
 9   open_count_last_60_days      73539 non-null  int64  
 10  login_count_last_10_days     73539 non-null  int64  
 11  login_count_last_30_days     73539 non-null  int64  
 12  login_count_last_60_days     73539 non-null  int64  
 13  checkout_count_last_10_days  73539 non-null  int64  
 14  checkout_count_last_30_days  73539 non-null  int64  
 15  checkout_count_last_60_days  73539 non-null  int64  
 16  open_flag                    73539 non-null  int64  
 17  row_id                       73539 non-null  int64  
 18  attr_1                       45347 non-null  float64
 19  attr_2                       73260 non-null  float64
 20  attr_3                       73539 non-null  float64
 21  age                          45347 non-null  float64
 22  domain                       73539 non-null  object 
dtypes: float64(4), int64(14), object(5)
memory usage: 13.5+ MB
In [7]:
traindf.describe()
Out[7]:
country_code user_id subject_line_length open_count_last_10_days open_count_last_30_days open_count_last_60_days login_count_last_10_days login_count_last_30_days login_count_last_60_days checkout_count_last_10_days checkout_count_last_30_days checkout_count_last_60_days open_flag row_id attr_1 attr_2 attr_3 age
count 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.000000 73539.00000 45347.000000 73260.000000 73539.000000 45347.000000
mean 2.533159 63048.595358 43.656373 0.911829 2.780457 5.019364 8.382287 25.284053 49.010430 0.909898 2.796040 5.420960 0.155781 36769.00000 0.999757 0.997256 1.077863 31.445564
std 1.675251 36068.006037 11.219323 1.754052 4.570239 7.902313 13.424436 37.853189 71.819327 2.887416 7.686064 13.980182 0.362650 21229.02506 0.015573 0.052308 1.035252 11.797405
min 1.000000 2.000000 10.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 -17.000000
25% 1.000000 31679.500000 34.000000 0.000000 0.000000 0.000000 0.000000 1.000000 3.000000 0.000000 0.000000 0.000000 0.000000 18384.50000 1.000000 1.000000 0.000000 24.000000
50% 2.000000 63340.000000 43.000000 0.000000 1.000000 2.000000 3.000000 11.000000 22.000000 0.000000 0.000000 1.000000 0.000000 36769.00000 1.000000 1.000000 1.000000 29.000000
75% 3.000000 94731.500000 52.000000 1.000000 3.000000 6.000000 12.000000 35.000000 69.000000 1.000000 3.000000 6.000000 0.000000 55153.50000 1.000000 1.000000 2.000000 36.000000
max 7.000000 127925.000000 75.000000 31.000000 92.000000 135.000000 254.000000 735.000000 1260.000000 157.000000 531.000000 783.000000 1.000000 73538.00000 1.000000 1.000000 4.000000 118.000000
In [8]:
def get_summary_df(df):
    
    columns = df.columns.tolist()
    
    dtypes = []
    unique_counts = []
    missing_counts = []
    missing_percentages = []
    total_counts = [df.shape[0]] * len(columns)

    for column in columns:
        dtype = str(df[column].dtype)
        dtypes.append(dtype)
        
        unique_count = df[column].nunique()
        unique_counts.append(unique_count)

        missing_count = df[column].isnull().sum()
        missing_counts.append(missing_count)
        
        missing_percentage = round((missing_count/df.shape[0]) * 100, 2)
        missing_percentages.append(missing_percentage)
        


    summary_df = pd.DataFrame({
        "column": columns,
        "dtype": dtypes,
        "unique_count": unique_counts,
        "missing_count": missing_counts,
        "missing_percentage": missing_percentages,
        "total_count": total_counts,
    })
    
    summary_df = summary_df.sort_values(by="missing_percentage", ascending=False).reset_index(drop=True)
    
    return summary_df
In [9]:
get_summary_df(traindf)
Out[9]:
column dtype unique_count missing_count missing_percentage total_count
0 age float64 99 28192 38.34 73539
1 attr_1 float64 2 28192 38.34 73539
2 attr_2 float64 2 279 0.38 73539
3 country_code int64 7 0 0.00 73539
4 login_count_last_60_days int64 637 0 0.00 73539
5 attr_3 float64 5 0 0.00 73539
6 row_id int64 73539 0 0.00 73539
7 open_flag int64 2 0 0.00 73539
8 checkout_count_last_60_days int64 219 0 0.00 73539
9 checkout_count_last_30_days int64 142 0 0.00 73539
10 checkout_count_last_10_days int64 71 0 0.00 73539
11 login_count_last_30_days int64 376 0 0.00 73539
12 grass_date object 47 0 0.00 73539
13 login_count_last_10_days int64 168 0 0.00 73539
14 open_count_last_60_days int64 76 0 0.00 73539
15 open_count_last_30_days int64 46 0 0.00 73539
16 open_count_last_10_days int64 20 0 0.00 73539
17 last_checkout_day object 947 0 0.00 73539
18 last_login_day object 893 0 0.00 73539
19 last_open_day object 549 0 0.00 73539
20 subject_line_length int64 56 0 0.00 73539
21 user_id int64 72845 0 0.00 73539
22 domain object 11 0 0.00 73539
In [10]:
for col in traindf.columns:
    if traindf[col].dtype == "object":
        print (traindf[col].unique())
['2019-07-16 00:00:00+08:00' '2019-08-02 00:00:00+08:00'
 '2019-08-17 00:00:00+08:00' '2019-07-19 00:00:00+08:00'
 '2019-07-18 00:00:00+08:00' '2019-07-30 00:00:00+08:00'
 '2019-07-23 00:00:00+08:00' '2019-07-17 00:00:00+08:00'
 '2019-08-25 00:00:00+08:00' '2019-09-02 00:00:00+08:00'
 '2019-08-12 00:00:00+08:00' '2019-08-29 00:00:00+08:00'
 '2019-08-09 00:00:00+08:00' '2019-08-16 00:00:00+08:00'
 '2019-08-30 00:00:00+08:00' '2019-07-25 00:00:00+08:00'
 '2019-08-26 00:00:00+08:00' '2019-07-28 00:00:00+08:00'
 '2019-08-01 00:00:00+08:00' '2019-08-13 00:00:00+08:00'
 '2019-08-14 00:00:00+08:00' '2019-08-03 00:00:00+08:00'
 '2019-08-27 00:00:00+08:00' '2019-08-28 00:00:00+08:00'
 '2019-08-23 00:00:00+08:00' '2019-08-04 00:00:00+08:00'
 '2019-08-20 00:00:00+08:00' '2019-07-22 00:00:00+08:00'
 '2019-08-06 00:00:00+08:00' '2019-08-24 00:00:00+08:00'
 '2019-07-24 00:00:00+08:00' '2019-09-01 00:00:00+08:00'
 '2019-07-26 00:00:00+08:00' '2019-08-07 00:00:00+08:00'
 '2019-08-18 00:00:00+08:00' '2019-08-19 00:00:00+08:00'
 '2019-07-20 00:00:00+08:00' '2019-07-29 00:00:00+08:00'
 '2019-08-08 00:00:00+08:00' '2019-08-21 00:00:00+08:00'
 '2019-07-21 00:00:00+08:00' '2019-08-05 00:00:00+08:00'
 '2019-07-27 00:00:00+08:00' '2019-08-15 00:00:00+08:00'
 '2019-08-22 00:00:00+08:00' '2019-08-11 00:00:00+08:00'
 '2019-08-10 00:00:00+08:00']
['19' '9' '14' '49' '227' '5' '13' '163' '7' '130' '27' 'Never open' '4'
 '24' '28' '12' '23' '70' '83' '26' '17' '2' '3' '182' '58' '22' '20' '10'
 '8' '82' '87' '41' '115' '207' '16' '11' '29' '77' '48' '34' '263' '25'
 '30' '56' '21' '80' '180' '91' '90' '47' '84' '64' '18' '61' '401' '78'
 '86' '32' '76' '75' '55' '38' '15' '39' '36' '71' '169' '59' '224' '85'
 '143' '208' '72' '140' '62' '60' '296' '164' '362' '125' '35' '168' '74'
 '73' '154' '210' '46' '33' '53' '69' '37' '93' '221' '233' '6' '347' '42'
 '116' '54' '148' '340' '66' '96' '89' '361' '134' '701' '79' '43' '107'
 '293' '379' '268' '63' '216' '138' '446' '40' '145' '104' '98' '280'
 '369' '249' '97' '103' '170' '57' '301' '552' '246' '267' '81' '297' '51'
 '466' '540' '306' '422' '44' '230' '174' '108' '314' '67' '229' '298'
 '326' '106' '333' '45' '31' '211' '65' '52' '95' '315' '50' '100' '102'
 '158' '228' '256' '226' '105' '454' '277' '368' '191' '244' '137' '181'
 '383' '203' '234' '295' '279' '88' '175' '288' '173' '290' '183' '294'
 '269' '1' '223' '92' '285' '321' '123' '313' '199' '176' '155' '124'
 '275' '299' '165' '271' '112' '253' '160' '239' '272' '141' '153' '139'
 '68' '189' '147' '94' '152' '109' '213' '185' '144' '327' '202' '371'
 '502' '336' '433' '177' '485' '119' '217' '142' '463' '310' '197' '437'
 '149' '464' '128' '132' '135' '198' '231' '248' '259' '316' '184' '264'
 '114' '117' '129' '136' '126' '151' '289' '495' '205' '201' '543' '342'
 '235' '714' '113' '421' '287' '178' '156' '204' '262' '179' '162' '282'
 '127' '254' '252' '266' '146' '200' '225' '118' '196' '187' '121' '101'
 '193' '319' '338' '322' '251' '260' '241' '247' '258' '214' '270' '273'
 '167' '131' '192' '222' '304' '219' '305' '215' '283' '302' '242' '150'
 '238' '274' '206' '159' '186' '261' '286' '549' '445' '358' '303' '325'
 '257' '307' '334' '291' '209' '329' '195' '237' '284' '300' '265' '278'
 '212' '190' '188' '172' '122' '133' '483' '460' '240' '588' '318' '349'
 '372' '520' '498' '684' '424' '245' '419' '456' '480' '345' '255' '335'
 '545' '111' '220' '761' '741' '697' '517' '161' '276' '157' '194' '308'
 '387' '610' '510' '365' '389' '595' '232' '417' '561' '743' '554' '711'
 '548' '481' '386' '250' '626' '351' '439' '110' '337' '309' '415' '354'
 '166' '312' '236' '120' '651' '355' '472' '344' '374' '99' '328' '569'
 '381' '171' '628' '392' '384' '774' '608' '377' '330' '350' '356' '734'
 '658' '655' '243' '292' '413' '425' '746' '564' '395' '798' '783' '643'
 '400' '352' '605' '484' '771' '800' '465' '727' '675' '331' '332' '479'
 '404' '525' '702' '533' '492' '281' '511' '582' '359' '416' '429' '766'
 '722' '423' '414' '428' '399' '324' '451' '430' '488' '516' '526' '555'
 '366' '385' '461' '473' '639' '607' '346' '320' '505' '452' '393' '402'
 '622' '435' '612' '670' '341' '786' '443' '808' '715' '339' '580' '614'
 '411' '431' '660' '441' '360' '571' '641' '506' '323' '311' '375' '317'
 '218' '585' '536' '440' '562' '573' '394' '742' '378' '343' '530' '490'
 '478' '531' '737' '457' '409' '391' '534' '418' '527' '736' '600' '602'
 '679' '705' '420' '621' '804' '450' '625' '408' '519' '631' '493' '703'
 '458' '448' '370' '603' '587' '500' '388' '547' '803' '544' '663' '397'
 '730' '763' '367' '606' '357']
['6' '4' '5' '9' '8' '3' '1' '10' '58' '44' '2' '7' '382' '135' '1396'
 '12' '11' '15' '18' '49' '114' '87' '0' '90' '25' '88' '327' '95' '22'
 '54' '91' '211' '46' '536' '13' '198' '14' '113' '299' '32' '107' '116'
 '147' '65' '36' '19' '251' '51' '26' '21' '901' '324' '349' '115' '592'
 '216' '20' '265' '31' '16' '24' '285' '245' '460' '45' '379' '106' '178'
 '86' '60' '310' '418' '27' '29' '205' '145' '472' '427' '42' '306' '899'
 '292' '220' '267' '222' '834' '101' '43' '164' '475' '206' '805' '85'
 '38' '838' '719' '832' '362' '30' '281' '53' '208' '527' '187' '799'
 '573' '57' '136' '80' '474' '508' '707' '659' '394' '300' '252' '17'
 '407' '40' '383' '117' '632' '83' '77' '679' '200' '693' '662' '503'
 '356' '188' '336' '541' '92' '456' '66' '155' '132' '584' '143' '74'
 '121' '352' '554' '81' '227' '167' '645' '571' '307' '467' '326' '260'
 '628' '37' '140' '622' '94' '62' '215' '607' '347' '23' '105' '56' '163'
 '610' '404' '39' '73' '75' '248' '34' '408' '282' '70' '100' '283' '212'
 '173' '378' '561' '581' '84' '68' '201' '33' '579' '493' '151' '582'
 '259' '102' '577' '274' '567' '396' '269' '562' '517' '28' '502' '559'
 '191' '47' '59' '403' '547' '542' '141' '519' '272' '529' '144' '526'
 '520' '521' '337' '497' '509' '256' '262' '110' '426' '82' '50' '289'
 '98' '196' '499' '428' '236' '130' '313' '96' '491' '103' '63' '468'
 '484' '193' '41' '481' '168' '370' '463' '319' '109' '48' '194' '429'
 '458' '459' '288' '445' '452' '448' '441' '402' '247' '67' '421' '425'
 '286' '128' '446' '325' '377' '61' '417' '596' '406' '175' '226' '400'
 '392' '387' '35' '366' '386' '345' '276' '380' '213' '317' '376' '371'
 '372' '368' '365' '364' '361' '353' '203' '359' '339' '131' '279' '355'
 '354' '334' '186' '348' '268' '241' '290' '333' '52' '64' '122' '224'
 '209' '161' '271' '311' '76' '207' '305' '294' '79' '202' '146' '249'
 '303' '112' '302' '296' '237' '291' '264' '278' '223' '152' '280' '189'
 '275' '228' '270' '263' '55' '261' '133' '258' '183' '231' '255' '72'
 '253' '137' '184' '111' '69' '225' '232' '157' '174' '243' '239' '142'
 '233' '139' '221' '218' '120' '182' '99' '170' '197' '192' 'Never login'
 '181' '169' '89' '159' '158' '154' '153' '125' '123' '148' '104' '129'
 '127' '124' '118' '108' '78' '93' '71' '955' '929' '431' '119' '470'
 '576' '532' '771' '637' '166' '469' '569' '375' '618' '613' '465' '603'
 '575' '609' '321' '190' '548' '323' '479' '477' '312' '442' '438' '316'
 '430' '397' '398' '298' '648' '360' '340' '342' '217' '254' '314' '240'
 '246' '176' '172' '97' '374' '284' '235' '566' '483' '595' '510' '528'
 '335' '511' '506' '126' '308' '390' '341' '346' '293' '330' '185' '138'
 '369' '476' '250' '466' '195' '234' '447' '695' '490' '551' '453' '162'
 '678' '219' '642' '558' '614' '214' '604' '524' '589' '495' '389' '422'
 '505' '487' '478' '320' '473' '464' '395' '451' '343' '436' '363' '230'
 '149' '385' '367' '357' '350' '329' '328' '331' '150' '179' '177' '171'
 '18096' '301' '439' '774' '643' '199' '757' '704' '669' '266' '513' '492'
 '488' '420' '318' '415' '391' '601' '297' '760' '393' '773' '238' '457'
 '859' '498' '384' '533' '507' '413' '416' '437' '435' '165' '273' '399'
 '244' '381' '242' '534' '847' '915' '388' '814' '694' '690' '647' '657'
 '681' '160' '587' '563' '624' '612' '544' '412' '134' '570' '552' '518'
 '338' '501' '482' '480' '440' '433' '315' '277' '180' '210' '373' '358'
 '309' '156' '987' '351' '705' '677' '675' '545' '666' '409' '663' '655'
 '620' '619' '594' '546' '585' '540' '553' '539' '204' '515' '514' '489'
 '485' '462' '432' '287' '295' '810' '698' '861' '322' '687' '635' '617'
 '578' '537' '538' '512' '450' '419' '692' '818' '401' '804' '933' '702'
 '816' '424' '858' '616' '639' '572' '800' '798' '257' '731' '713' '696'
 '684' '658' '755' '615' '525' '574' '410' '434' '423' '344' '229' '18102'
 '1291' '1354' '605' '813' '867' '826' '708' '729' '671' '332' '608' '543'
 '461' '471' '405' '18103' '961' '905' '701' '975' '560' '688' '699' '494'
 '626' '600' '496' '411' '18105' '611' '1135' '738' '557' '770' '550'
 '672' '667' '644' '633' '580' '591' '556' '304' '833' '631' '1001' '960'
 '912' '752' '767' '629' '722' '700' '665' '606' '443' '564' '685' '455'
 '18107' '875' '486' '715' '454' '414' '565' '661' '792' '736' '720' '747'
 '718' '625' '449' '668' '623' '627' '535' '1150' '907' '523' '1305'
 '1140' '1119' '951' '697' '821' '892' '881' '920' '910' '727' '888' '598'
 '586' '664' '651' '593' '630' '568' '549' '1332' '937' '1392' '1357'
 '776' '883' '765' '706' '500' '599' '555' '504' '18114' '822' '682' '602'
 '634' '742' '775' '887' '1049' '873' '904' '890' '896' '703' '812' '744'
 '709' '717' '621' '18116' '1011' '930' '924' '733' '735' '714' '1013'
 '636' '1113' '1052' '1037' '444' '710' '786' '689' '1028' '1148' '516'
 '894' '854' '753' '18122' '877' '1100' '1445' '976' '716' '884' '866'
 '845' '743' '724' '725' '646' '652' '640' '597' '18124' '902' '531' '870'
 '801' '650' '754' '1097' '730' '530' '1231' '950' '656' '793' '781' '653'
 '673' '1152' '943' '944' '921' '807' '660' '638' '641' '588' '783' '766'
 '670' '927' '1051' '654' '18129' '923' '790' '852' '815' '680' '649'
 '1002' '778' '779' '583' '18131' '522' '874' '759' '18133' '18134' '828'
 '777' '723' '674' '1288' '898' '871' '851' '1434' '1262' '876' '1091'
 '948' '839' '853' '745' '1437' '1117' '817' '1251' '18140' '889' '1380'
 '18141' '820' '734' '683' '1083']
['18' '8' '5' '53' '221' '70' '3' '28' '26' '12' '156' '2' '7' '86' '57'
 '23' '9' '6' '49' '16' '468' '1' 'Never checkout' '139' '128' '40' '22'
 '13' '4' '21' '60' '15' '420' '54' '50' '24' '123' '855' '17' '95' '29'
 '25' '10' '172' '499' '11' '190' '111' '214' '186' '58' '32' '319' '76'
 '63' '1244' '20' '175' '14' '38' '1122' '75' '83' '64' '1167' '1169' '48'
 '34' '290' '433' '168' '101' '35' '148' '39' '84' '69' '30' '37' '205'
 '166' '143' '230' '19' '1118' '124' '224' '622' '676' '939' '246' '78'
 '46' '118' '61' '62' '1024' '79' '326' '415' '36' '659' '112' '107' '47'
 '272' '434' '31' '41' '142' '243' '27' '283' '45' '413' '105' '151' '316'
 '379' '202' '43' '451' '33' '68' '533' '232' '87' '91' '147' '572' '917'
 '144' '104' '59' '403' '443' '67' '187' '306' '700' '184' '106' '66'
 '551' '456' '358' '55' '235' '452' '85' '719' '442' '51' '110' '355' '73'
 '310' '215' '527' '328' '302' '194' '259' '237' '597' '493' '71' '137'
 '188' '479' '743' '727' '746' '425' '311' '180' '81' '474' '470' '158'
 '129' '627' '119' '343' '614' '219' '125' '650' '133' '303' '120' '692'
 '108' '431' '94' '271' '321' '356' '341' '122' '675' '600' '204' '308'
 '667' '274' '162' '222' '42' '161' '654' '655' '440' '100' '97' '647'
 '211' '225' '645' '478' '173' '65' '349' '387' '134' '89' '216' '80'
 '609' '409' '611' '406' '333' '249' '570' '206' '56' '77' '594' '581'
 '185' '229' '213' '247' '584' '167' '126' '279' '195' '574' '535' '523'
 '217' '548' '502' '155' '505' '322' '278' '394' '529' '115' '534' '169'
 '109' '152' '344' '514' '269' '189' '176' '150' '200' '501' '500' '404'
 '103' '96' '481' '174' '471' '93' '407' '132' '288' '154' '312' '462'
 '131' '459' '291' '383' '146' '449' '121' '336' '416' '233' '72' '427'
 '242' '350' '275' '418' '294' '257' '159' '99' '90' '410' '183' '258'
 '389' '385' '74' '374' '378' '372' '331' '364' '365' '113' '197' '261'
 '354' '212' '52' '345' '264' '330' '231' '324' '299' '182' '320' '191'
 '250' '218' '177' '309' '268' '255' '289' '277' '116' '227' '263' '209'
 '203' '164' '245' '160' '82' '223' '192' '102' '44' '127' '140' '98'
 '130' '566' '1260' '1132' '347' '1274' '426' '170' '635' '582' '802'
 '863' '948' '210' '1000' '136' '236' '267' '138' '117' '305' '88' '179'
 '586' '705' '681' '135' '270' '473' '145' '642' '342' '178' '618' '377'
 '400' '605' '487' '114' '244' '329' '547' '92' '359' '503' '357' '332'
 '256' '297' '199' '398' '397' '395' '390' '360' '393' '157' '171' '325'
 '318' '234' '284' '207' '286' '262' '248' '254' '516' '1026' '994' '504'
 '817' '811' '141' '749' '208' '686' '165' '578' '688' '239' '610' '517'
 '496' '421' '464' '386' '461' '399' '380' '414' '352' '362' '196' '300'
 '273' '193' '201' '149' '1039' '1162' '435' '730' '466' '228' '644' '388'
 '737' '831' '453' '733' '491' '327' '335' '602' '731' '710' '543' '696'
 '658' '666' '641' '488' '313' '429' '454' '573' '486' '576' '281' '437'
 '315' '198' '241' '484' '408' '373' '367' '363' '460' '467' '220' '375'
 '436' '240' '282' '339' '396' '266' '317' '361' '340' '337' '285' '226'
 '531' '251' '369' '980' '885' '643' '401' '422' '706' '580' '678' '638'
 '603' '616' '608' '558' '560' '508' '368' '238' '298' '850' '323' '412'
 '893' '252' '265' '334' '392' '153' '348' '465' '536' '482' '382' '253'
 '402' '346' '260' '1138' '163' '595' '869' '625' '623' '803' '448' '301'
 '485' '490' '636' '556' '565' '511' '540' '515' '509' '445' '480' '457'
 '314' '438' '432' '381' '366' '338' '537' '510' '181' '742' '351' '430'
 '424' '801' '691' '411' '599' '455' '717' '649' '469' '677' '307' '683'
 '439' '672' '670' '663' '657' '526' '624' '613' '596' '604' '592' '296'
 '384' '295' '541' '506' '371' '475' '441' '376' '293' '287' '865' '756'
 '698' '631' '1047' '782' '653' '498' '549' '538' '292' '542' '1124' '835'
 '775' '878' '701' '931' '552' '930' '883' '419' '569' '769' '697' '507'
 '450' '621' '598' '575' '545' '353' '513' '446' '497' '998' '477' '819'
 '652' '805' '601' '304' '519' '921' '571' '391' '405' '1158' '738' '521'
 '927' '777' '577' '612' '786' '755' '495' '646' '664' '626' '530' '588'
 '423' '550' '567' '444' '476' '370' '280' '1209' '703' '984' '1070' '587'
 '684' '754' '815' '753' '593' '606' '554' '585' '532' '539' '483' '463'
 '615' '1392' '276' '986' '1130' '792' '1003' '800' '771' '637' '953'
 '690' '966' '806' '779' '773' '568' '709' '656' '557' '687' '544' '1281'
 '1252' '1046' '757' '564' '807' '739' '634' '447' '472' '752' '794' '721'
 '562' '489' '1147' '1096' '778' '693' '763' '1016' '791' '589' '797'
 '783' '492' '559' '894' '910' '1140' '1100' '916' '639' '525' '990' '864'
 '673' '857' '845' '920' '766' '740' '843' '780' '824' '520' '784' '776'
 '633' '722' '725' '630' '512' '1179' '1393' '1193' '1082' '923' '890'
 '736' '553' '528' '760' '715' '617' '590' '1103' '723' '804' '702' '583'
 '494' '1445' '1430' '1127' '1128' '1037' '1107' '874' '591' '876' '987'
 '892' '888' '854' '868' '834' '851' '751' '720' '711' '674' '620' '661'
 '640' '563' '561' '915' '1063' '761' '718' '1336' '1230' '1079' '1105'
 '983' '787' '853' '750' '765' '648' '1199' '682' '1066' '607' '555'
 '1206' '820' '522' '417' '518' '1203' '669' '1033' '781' '989' '825'
 '619' '785' '741' '651' '821' '428' '708' '660' '1391' '1040' '891' '861'
 '668' '629' '632' '1420' '908' '1168' '832' '679' '943' '1153' '944'
 '1030' '1056' '974' '685' '1159' '1337' '716' '704' '1114' '1275' '1324'
 '1270' '1110' '947' '524' '1042' '1304' '1049' '1116' '897' '867' '818'
 '712' '1076' '458' '988' '965' '822' '699' '1186' '799' '1010' '848'
 '907' '814' '1210' '764' '887' '837' '836' '1377' '714' '732' '546'
 '1028' '1091' '1350' '1313' '1301' '1072' '724' '1022' '942' '852' '905'
 '758' '1085' '901' '1012' '996' '982' '1380' '1201' '1283' '1027' '748'
 '1417' '1087']
['@gmail.com' '@hotmail.com' '@yahoo.com' '@icloud.com' 'other'
 '@outlook.com' '@ymail.com' '@live.com' '@163.com' '@qq.com'
 '@rocketmail.com']
In [11]:
#date
traindf['grass_date'] = pd.to_datetime(traindf['grass_date']).dt.date
testdf['grass_date'] = pd.to_datetime(testdf['grass_date']).dt.date
In [12]:
traindf['year'] = pd.to_datetime(traindf['grass_date']).dt.year
traindf['month'] = pd.to_datetime(traindf['grass_date']).dt.month
traindf['day'] = pd.to_datetime(traindf['grass_date']).dt.day

testdf['year'] = pd.to_datetime(testdf['grass_date']).dt.year
testdf['month'] = pd.to_datetime(testdf['grass_date']).dt.month
testdf['day'] = pd.to_datetime(testdf['grass_date']).dt.day
In [13]:
#as type
traindf['last_open_day'] = traindf['last_open_day'].replace('Never open', 0)
traindf['last_login_day'] = traindf['last_login_day'].replace('Never login', 0)
traindf['last_checkout_day'] = traindf['last_checkout_day'].replace('Never checkout', 0)

testdf['last_open_day'] = testdf['last_open_day'].replace('Never open', 0)
testdf['last_login_day'] = testdf['last_login_day'].replace('Never login', 0)
testdf['last_checkout_day'] = testdf['last_checkout_day'].replace('Never checkout', 0)
In [14]:
for col in ['last_open_day','last_login_day','last_checkout_day']:
    traindf[col] = traindf[col].astype(np.int32)
In [15]:
for col in ['last_open_day','last_login_day','last_checkout_day']:
    testdf[col] = testdf[col].astype(np.int32)
In [16]:
#missing variable
msno.matrix(traindf) 
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x280c5045d48>
In [17]:
traindf['attr_1'].fillna(2.0, inplace = True)
traindf['attr_2'].fillna(traindf['attr_2'].value_counts().index[0], inplace=True)
In [18]:
testdf['attr_1'].fillna(2.0, inplace = True)
testdf['attr_2'].fillna(testdf['attr_2'].value_counts().index[0], inplace=True)
In [19]:
import statistics 
median = round(traindf['age'].median())
std = traindf['age'].std()
outliers = (traindf['age']-median).abs()>std
traindf['age'][outliers]=np.nan
traindf['age'].fillna(median, inplace=True)
C:\Users\USER\anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
In [20]:
median = round(testdf['age'].median())
std = testdf['age'].std()
outliers = (testdf['age']-median).abs()>std
testdf['age'][outliers]=np.nan
testdf['age'].fillna(median, inplace=True)
C:\Users\USER\anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
In [21]:
msno.matrix(traindf) 
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x280c66b3b08>

Target variables

In [22]:
#imbalance
fig = px.histogram(traindf, x="open_flag")
fig.show()
In [23]:
traindf['open_flag'].value_counts(normalize = True)
Out[23]:
0    0.844219
1    0.155781
Name: open_flag, dtype: float64

Input variables

In [24]:
del traindf['grass_date'], testdf['grass_date'], traindf['user_id'], testdf['user_id'], traindf['row_id'], testdf['row_id']
In [25]:
cat_features = ['country_code', 'domain', 'year', 'month', 'day', 'attr_1', 'attr_2', 'attr_3']
num_features = [col for col in traindf.columns if col != "open_flag" and col not in cat_features]
print(cat_features, num_features)
['country_code', 'domain', 'year', 'month', 'day', 'attr_1', 'attr_2', 'attr_3'] ['subject_line_length', 'last_open_day', 'last_login_day', 'last_checkout_day', 'open_count_last_10_days', 'open_count_last_30_days', 'open_count_last_60_days', 'login_count_last_10_days', 'login_count_last_30_days', 'login_count_last_60_days', 'checkout_count_last_10_days', 'checkout_count_last_30_days', 'checkout_count_last_60_days', 'age']
In [26]:
#continuous variable
def plot_feature_scatter(df1, df2, features):
    i = 0
    sns.set_style("whitegrid")
    plt.figure
    
    fig, ax = plt.subplots(4, 4, figsize=(20, 20))
    
    for feature in features:
        i += 1
        plt.subplot(4, 4, i)
        plt.scatter(df1[feature], df2[feature], marker="+", color='#2B3A67', alpha=0.2)
        plt.xlabel(feature, fontsize=9)
        
    plt.show()
In [27]:
#scalization. normalization     
plot_feature_scatter(traindf.sample(50000), testdf.sample(50000), num_features)
In [28]:
plt.figure(figsize=(14,10))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(traindf[num_features].corr(), cmap = "BrBG",
linewidths=0.1, square=True, linecolor='white',
annot=True)
plt.show()
In [29]:
def plot_feature_distribution(df1, df2, label1, label2, features, size=[3,5]):
    i = 0
    sns.set_style("whitegrid")
    
    plt.figure()
    fig, ax = plt.subplots(size[0], size[1], figsize=(22, 20))
    
    for feature in features:
        i += 1
        plt.subplot(size[0], size[1], i)
        
        sns.distplot(df1[feature], hist=False, label=label1, kde_kws={'bw':1.5}, color="#5982C5")
        sns.distplot(df2[feature], hist=False, label=label2, kde_kws={'bw':1.5}, color="#FB3523")
        
        plt.xlabel(feature, fontsize=9)
        
        locs, labels = plt.xticks()
        plt.tick_params(axis="x", which="major", labelsize=6, pad=6)
        plt.tick_params(axis="y", which="major", labelsize=6)
        
    plt.show()
In [30]:
target_0_df = traindf.loc[traindf["open_flag"] == 0]
target_1_df = traindf.loc[traindf["open_flag"] == 1]

plot_feature_distribution(target_0_df, target_1_df, '0', '1', num_features, size=[3,5])
<Figure size 432x288 with 0 Axes>
In [31]:
#category variable
lbls, freqs = np.unique(traindf['country_code'].values, return_counts = True)

plt.figure(figsize = (10, 10))
plt.title('Train - Country Code')
plt.pie(freqs, labels = lbls, autopct = '%1.1f%%', shadow = False, startangle = 90)
plt.show()
In [32]:
target = traindf['open_flag'].copy()
del traindf['open_flag']
target
Out[32]:
0        0
1        1
2        0
3        0
4        0
        ..
73534    0
73535    0
73536    0
73537    1
73538    0
Name: open_flag, Length: 73539, dtype: int64
In [33]:
get_summary_df(traindf)
Out[33]:
column dtype unique_count missing_count missing_percentage total_count
0 country_code int64 7 0 0.0 73539
1 subject_line_length int64 56 0 0.0 73539
2 month int64 3 0 0.0 73539
3 year int64 1 0 0.0 73539
4 domain object 11 0 0.0 73539
5 age float64 23 0 0.0 73539
6 attr_3 float64 5 0 0.0 73539
7 attr_2 float64 2 0 0.0 73539
8 attr_1 float64 3 0 0.0 73539
9 checkout_count_last_60_days int64 219 0 0.0 73539
10 checkout_count_last_30_days int64 142 0 0.0 73539
11 checkout_count_last_10_days int64 71 0 0.0 73539
12 login_count_last_60_days int64 637 0 0.0 73539
13 login_count_last_30_days int64 376 0 0.0 73539
14 login_count_last_10_days int64 168 0 0.0 73539
15 open_count_last_60_days int64 76 0 0.0 73539
16 open_count_last_30_days int64 46 0 0.0 73539
17 open_count_last_10_days int64 20 0 0.0 73539
18 last_checkout_day int32 947 0 0.0 73539
19 last_login_day int32 892 0 0.0 73539
20 last_open_day int32 549 0 0.0 73539
21 day int64 30 0 0.0 73539

Step 2 Modeling

In [34]:
from sklearn.model_selection import train_test_split
# Train_Test Split
Xtrain, Xvalid, ytrain, yvalid = train_test_split(traindf,target,test_size=0.30)
In [35]:
 ytrain
Out[35]:
60623    0
47478    0
15265    0
14619    0
37078    0
        ..
1862     0
56506    0
38514    0
71515    0
23195    0
Name: open_flag, Length: 51477, dtype: int64
In [36]:
#preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, MinMaxScaler
preprocessor = make_column_transformer(
    (OneHotEncoder(), cat_features),
    (MinMaxScaler(), num_features),
    remainder='passthrough'
)
#preprocessor1 = make_column_transformer(
#    (OneHotEncoder(), cat_features),
#    (StandardScaler(), num_features),
#    remainder='passthrough'
#)
In [37]:
#use the model to deal with the imbalance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

models = []

#linear model
models.append(("simple_logistic", 
               make_pipeline(preprocessor,
                             LogisticRegression())))
models.append(("simple_logistic_elaticnet", 
               make_pipeline(preprocessor,
                             SGDClassifier(loss='log', penalty='elasticnet', alpha=0.0001, l1_ratio=0.15))))
models.append(("naivebayes", 
               make_pipeline(preprocessor,
                             BernoulliNB())))
#non-linear model
models.append(("random_forest", 
               make_pipeline(preprocessor,
                             RandomForestClassifier(max_depth=2, random_state=0))))

models.append(("xgboost", 
               make_pipeline(preprocessor,
                             xgb.XGBClassifier())))

models.append(("lgboost", 
               make_pipeline(preprocessor,
                             lgb.LGBMClassifier())))
In [38]:
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.metrics import matthews_corrcoef
#k-fold
my_ctrl = RepeatedKFold(n_splits=5, n_repeats=2)
#performance matrix
mcc = make_scorer(matthews_corrcoef)
my_metric =(mcc)
In [39]:
from numpy import absolute, mean, std
model_results = []
names = []
for name, model in models:
    cv_results = cross_validate(model, Xtrain, ytrain, scoring=my_metric, cv=my_ctrl, n_jobs=-1)
    model_results.append(cv_results["test_score"])
    names.append(name)
    msg = "%s:(%s) %f (%f)" % (name, model[-1], mean(absolute(model_results)), std(absolute(model_results)))
    print(msg)
simple_logistic:(LogisticRegression()) 0.502675 (0.008331)
simple_logistic_elaticnet:(SGDClassifier(loss='log', penalty='elasticnet')) 0.502233 (0.014895)
naivebayes:(BernoulliNB()) 0.456289 (0.066650)
random_forest:(RandomForestClassifier(max_depth=2, random_state=0)) 0.342217 (0.205837)
xgboost:(XGBClassifier()) 0.377162 (0.196966)
lgboost:(LGBMClassifier()) 0.402097 (0.188286)
In [40]:
Mcc = pd.DataFrame(absolute(model_results),index=names).T
In [41]:
import plotly.graph_objects as go
fig = go.Figure()
for name in names:
    fig.add_trace(go.Box(x=Mcc[name],name=name))
fig.update_layout(title_text="Mcc")
fig.show()
In [42]:
Xprocess = preprocessor.fit_transform(Xtrain)
In [43]:
from hyperopt import hp, fmin, tpe, Trials, space_eval
from hyperopt.pyll.base import scope
import time
from sklearn.model_selection import cross_val_score
# objective function
def ho_func(params):
    estimator = lgb.LGBMClassifier(objective='binary',
                         boosting_type='goss',
                         is_unbalance=False,
                         learning_rate=0.03,
                         metric='roc_auc,log_loss', n_estimators=200,
                         class_weight=params['class_weight'],
                         num_leaves=31, random_state=42, max_depth=-1, reg_lambda=5, reg_alpha=1)
    estimator.fit(Xprocess, ytrain)
    # pred_array = estimator.predict(X_test)
    # res = accuracy_score(y_test, pred_array)
    # pred_test = estimator.predict(X_test)
    # res = roc_auc_score(y_test, pred_test)
    res = cross_val_score(estimator, Xprocess, ytrain,
                          scoring=mcc, cv=3).mean()
    print("*" * 30)
    print(params)
    print("cv mcc: {}".format(res))

    return 1-res
In [44]:
weight_list = [{0: one} for one in list(np.arange(0, 10, 0.1))]
ho_space = {
    'max_depth': hp.choice('max_depth', range(5, 30, 1)),
    # 'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(100, 1000, 100)),
    # 'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    # 'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    # 'min_data_in_leaf': hp.choice('min_data_in_leaf', range(3, 15, 2)),
    'reg_alpha': hp.quniform('reg_alpha', 0, 3,0.1),
    'reg_lambda': hp.quniform('reg_lambda', 0, 5,0.1),
    'num_leaves': hp.choice('num_leaves', range(20, 50, 1)),
    'class_weight': hp.choice('class_weight', weight_list), 'verbose': -1}

start=time.time()
trials=Trials()
best=fmin(fn=ho_func, space=ho_space, algo=tpe.suggest,
          max_evals=100, trials=trials)
print(f'Total time : {(time.time()-start):.5f} seconds')
print(f'Best Parameters - val_loss:{np.asarray(trials.losses(), dtype=float).max()} for {space_eval(ho_space,best)}')
lbm_model=lgb.LGBMClassifier().set_params(**space_eval(ho_space, best))
lbm_model
******************************                                                                                         
{'class_weight': {0: 7.9}, 'max_depth': 6, 'n_estimators': 400, 'num_leaves': 41, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.4, 'verbose': -1}
cv mcc: 0.24776130880457206                                                                                            
******************************                                                                                         
{'class_weight': {0: 6.300000000000001}, 'max_depth': 21, 'n_estimators': 600, 'num_leaves': 37, 'reg_alpha': 1.9000000000000001, 'reg_lambda': 4.3, 'verbose': -1}
cv mcc: 0.2893787071261454                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.4000000000000004}, 'max_depth': 26, 'n_estimators': 600, 'num_leaves': 41, 'reg_alpha': 2.3000000000000003, 'reg_lambda': 0.1, 'verbose': -1}
cv mcc: 0.45072675756676756                                                                                            
******************************                                                                                         
{'class_weight': {0: 4.0}, 'max_depth': 21, 'n_estimators': 200, 'num_leaves': 48, 'reg_alpha': 2.1, 'reg_lambda': 0.2, 'verbose': -1}
cv mcc: 0.37775561753211423                                                                                            
******************************                                                                                         
{'class_weight': {0: 6.2}, 'max_depth': 8, 'n_estimators': 900, 'num_leaves': 33, 'reg_alpha': 0.5, 'reg_lambda': 2.5, 'verbose': -1}
cv mcc: 0.29215950180777334                                                                                            
******************************                                                                                         
{'class_weight': {0: 2.3000000000000003}, 'max_depth': 25, 'n_estimators': 800, 'num_leaves': 25, 'reg_alpha': 1.3, 'reg_lambda': 1.4000000000000001, 'verbose': -1}
cv mcc: 0.4565230346267158                                                                                             
******************************                                                                                         
{'class_weight': {0: 5.9}, 'max_depth': 24, 'n_estimators': 400, 'num_leaves': 38, 'reg_alpha': 0.9, 'reg_lambda': 1.1, 'verbose': -1}
cv mcc: 0.2993087091975715                                                                                             
******************************                                                                                         
{'class_weight': {0: 6.5}, 'max_depth': 28, 'n_estimators': 100, 'num_leaves': 24, 'reg_alpha': 1.5, 'reg_lambda': 2.7, 'verbose': -1}
cv mcc: 0.2823876104662914                                                                                             
******************************                                                                                         
{'class_weight': {0: 6.800000000000001}, 'max_depth': 6, 'n_estimators': 700, 'num_leaves': 46, 'reg_alpha': 1.0, 'reg_lambda': 4.0, 'verbose': -1}
cv mcc: 0.2718061719731431                                                                                             
******************************                                                                                         
{'class_weight': {0: 4.5}, 'max_depth': 18, 'n_estimators': 400, 'num_leaves': 21, 'reg_alpha': 2.0, 'reg_lambda': 0.1, 'verbose': -1}
cv mcc: 0.35942241812609393                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.700000000000001}, 'max_depth': 19, 'n_estimators': 500, 'num_leaves': 39, 'reg_alpha': 0.4, 'reg_lambda': 2.5, 'verbose': -1}
cv mcc: 0.2258884117946742                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.6}, 'max_depth': 25, 'n_estimators': 600, 'num_leaves': 43, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 1.9000000000000001, 'verbose': -1}
cv mcc: 0.4389916578290467                                                                                             
******************************                                                                                         
{'class_weight': {0: 4.2}, 'max_depth': 20, 'n_estimators': 300, 'num_leaves': 24, 'reg_alpha': 1.0, 'reg_lambda': 3.6, 'verbose': -1}
cv mcc: 0.3709259537480904                                                                                             
******************************                                                                                         
{'class_weight': {0: 8.5}, 'max_depth': 21, 'n_estimators': 900, 'num_leaves': 38, 'reg_alpha': 1.9000000000000001, 'reg_lambda': 1.0, 'verbose': -1}
cv mcc: 0.23320017758120407                                                                                            
******************************                                                                                         
{'class_weight': {0: 5.300000000000001}, 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 33, 'reg_alpha': 1.8, 'reg_lambda': 2.5, 'verbose': -1}
cv mcc: 0.32802555811594275                                                                                            
******************************                                                                                         
{'class_weight': {0: 4.3}, 'max_depth': 18, 'n_estimators': 400, 'num_leaves': 27, 'reg_alpha': 1.3, 'reg_lambda': 1.0, 'verbose': -1}
cv mcc: 0.36359839011274103                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.3}, 'max_depth': 27, 'n_estimators': 400, 'num_leaves': 40, 'reg_alpha': 2.0, 'reg_lambda': 3.0, 'verbose': -1}
cv mcc: 0.23963212055588437                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.700000000000001}, 'max_depth': 17, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.0, 'reg_lambda': 0.6000000000000001, 'verbose': -1}
cv mcc: 0.2258884117946742                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.9000000000000004}, 'max_depth': 22, 'n_estimators': 500, 'num_leaves': 30, 'reg_alpha': 2.7, 'reg_lambda': 3.6, 'verbose': -1}
cv mcc: 0.42810846222381044                                                                                            
******************************                                                                                         
{'class_weight': {0: 5.9}, 'max_depth': 14, 'n_estimators': 800, 'num_leaves': 27, 'reg_alpha': 1.0, 'reg_lambda': 1.7000000000000002, 'verbose': -1}
cv mcc: 0.2993087091975715                                                                                             
******************************                                                                                         
{'class_weight': {0: 3.9000000000000004}, 'max_depth': 26, 'n_estimators': 800, 'num_leaves': 25, 'reg_alpha': 2.6, 'reg_lambda': 1.8, 'verbose': -1}
cv mcc: 0.37929696759408976                                                                                            
******************************                                                                                         
{'class_weight': {0: 2.3000000000000003}, 'max_depth': 16, 'n_estimators': 600, 'num_leaves': 34, 'reg_alpha': 3.0, 'reg_lambda': 1.4000000000000001, 'verbose': -1}
cv mcc: 0.4565230346267158                                                                                             
******************************                                                                                         
{'class_weight': {0: 9.700000000000001}, 'max_depth': 16, 'n_estimators': 800, 'num_leaves': 34, 'reg_alpha': 3.0, 'reg_lambda': 1.5, 'verbose': -1}
cv mcc: 0.21023701459709276                                                                                            
******************************                                                                                         
{'class_weight': {0: 6.6000000000000005}, 'max_depth': 12, 'n_estimators': 800, 'num_leaves': 26, 'reg_alpha': 1.5, 'reg_lambda': 4.9, 'verbose': -1}
cv mcc: 0.27892884417883285                                                                                            
******************************                                                                                         
{'class_weight': {0: 9.3}, 'max_depth': 16, 'n_estimators': 300, 'num_leaves': 34, 'reg_alpha': 1.3, 'reg_lambda': 2.0, 'verbose': -1}
cv mcc: 0.21481640920418196                                                                                            
******************************                                                                                         
{'class_weight': {0: 2.3000000000000003}, 'max_depth': 23, 'n_estimators': 200, 'num_leaves': 23, 'reg_alpha': 2.9000000000000004, 'reg_lambda': 1.3, 'verbose': -1}
cv mcc: 0.4565230346267158                                                                                             
******************************                                                                                         
{'class_weight': {0: 3.8000000000000003}, 'max_depth': 23, 'n_estimators': 200, 'num_leaves': 44, 'reg_alpha': 3.0, 'reg_lambda': 0.8, 'verbose': -1}
cv mcc: 0.3850619006717337                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.3000000000000003}, 'max_depth': 5, 'n_estimators': 600, 'num_leaves': 28, 'reg_alpha': 0.1, 'reg_lambda': 2.2, 'verbose': -1}
cv mcc: 0.4565230346267158                                                                                             
******************************                                                                                         
{'class_weight': {0: 8.1}, 'max_depth': 5, 'n_estimators': 600, 'num_leaves': 28, 'reg_alpha': 0.1, 'reg_lambda': 3.0, 'verbose': -1}
cv mcc: 0.24236322737831587                                                                                            
******************************                                                                                         
{'class_weight': {0: 9.5}, 'max_depth': 7, 'n_estimators': 600, 'num_leaves': 28, 'reg_alpha': 0.2, 'reg_lambda': 2.0, 'verbose': -1}
cv mcc: 0.2140256681369249                                                                                             
******************************                                                                                         
{'class_weight': {0: 9.8}, 'max_depth': 5, 'n_estimators': 600, 'num_leaves': 36, 'reg_alpha': 2.4000000000000004, 'reg_lambda': 3.1, 'verbose': -1}
cv mcc: 0.20653907414037256                                                                                            
******************************                                                                                         
{'class_weight': {0: 9.9}, 'max_depth': 29, 'n_estimators': 700, 'num_leaves': 20, 'reg_alpha': 1.7000000000000002, 'reg_lambda': 0.5, 'verbose': -1}
cv mcc: 0.20736820582685497                                                                                            
******************************                                                                                         
{'class_weight': {0: 3.5}, 'max_depth': 10, 'n_estimators': 600, 'num_leaves': 45, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 2.1, 'verbose': -1}
cv mcc: 0.39774678530606433                                                                                            
******************************                                                                                         
{'class_weight': {0: 3.2}, 'max_depth': 13, 'n_estimators': 200, 'num_leaves': 22, 'reg_alpha': 2.3000000000000003, 'reg_lambda': 2.2, 'verbose': -1}
cv mcc: 0.4124444909358525                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.3000000000000003}, 'max_depth': 23, 'n_estimators': 200, 'num_leaves': 23, 'reg_alpha': 2.8000000000000003, 'reg_lambda': 1.5, 'verbose': -1}
cv mcc: 0.4565230346267158                                                                                             
******************************                                                                                         
{'class_weight': {0: 4.1000000000000005}, 'max_depth': 9, 'n_estimators': 200, 'num_leaves': 42, 'reg_alpha': 2.7, 'reg_lambda': 0.30000000000000004, 'verbose': -1}
cv mcc: 0.37218266937059014                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.8}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 23, 'reg_alpha': 2.5, 'reg_lambda': 1.3, 'verbose': -1}
cv mcc: 0.48043449216253037                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.9}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 23, 'reg_alpha': 2.3000000000000003, 'reg_lambda': 1.6, 'verbose': -1}
cv mcc: 0.22253588517082548                                                                                            
******************************                                                                                         
{'class_weight': {0: 5.0}, 'max_depth': 11, 'n_estimators': 900, 'num_leaves': 23, 'reg_alpha': 2.5, 'reg_lambda': 1.2000000000000002, 'verbose': -1}
cv mcc: 0.3318447353957221                                                                                             
******************************                                                                                         
{'class_weight': {0: 9.1}, 'max_depth': 8, 'n_estimators': 900, 'num_leaves': 29, 'reg_alpha': 2.2, 'reg_lambda': 0.0, 'verbose': -1}
cv mcc: 0.22002592865436008                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.8}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 35, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 3.4000000000000004, 'verbose': -1}
cv mcc: 0.48043449216253037                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.8}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 35, 'reg_alpha': 0.8, 'reg_lambda': 4.6000000000000005, 'verbose': -1}
cv mcc: 0.48043449216253037                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.8}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 35, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 3.8000000000000003, 'verbose': -1}
cv mcc: 0.48043449216253037                                                                                            
******************************                                                                                         
{'class_weight': {0: 4.4}, 'max_depth': 15, 'n_estimators': 900, 'num_leaves': 47, 'reg_alpha': 0.4, 'reg_lambda': 4.0, 'verbose': -1}
cv mcc: 0.3686143245218358                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.1000000000000005}, 'max_depth': 24, 'n_estimators': 700, 'num_leaves': 35, 'reg_alpha': 0.8, 'reg_lambda': 4.800000000000001, 'verbose': -1}
cv mcc: 0.26593457617013017                                                                                            
******************************                                                                                         
{'class_weight': {0: 4.7}, 'max_depth': 19, 'n_estimators': 500, 'num_leaves': 49, 'reg_alpha': 1.6, 'reg_lambda': 3.3000000000000003, 'verbose': -1}
cv mcc: 0.34600729107788397                                                                                            
******************************                                                                                         
{'class_weight': {0: 2.2}, 'max_depth': 28, 'n_estimators': 900, 'num_leaves': 32, 'reg_alpha': 1.2000000000000002, 'reg_lambda': 4.5, 'verbose': -1}
cv mcc: 0.4604141087336358                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.8000000000000003}, 'max_depth': 15, 'n_estimators': 300, 'num_leaves': 21, 'reg_alpha': 2.1, 'reg_lambda': 2.8000000000000003, 'verbose': -1}
cv mcc: 0.43353445429395626                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 27, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.4, 'reg_lambda': 4.3, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 6.9}, 'max_depth': 27, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.5, 'reg_lambda': 4.1000000000000005, 'verbose': -1}
cv mcc: 0.2708126892339995                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.0}, 'max_depth': 27, 'n_estimators': 100, 'num_leaves': 37, 'reg_alpha': 1.2000000000000002, 'reg_lambda': 0.8, 'verbose': -1}
cv mcc: 0.4696935542543224                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.7000000000000002}, 'max_depth': 20, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 3.8000000000000003, 'verbose': -1}
cv mcc: 0.48823198527589134                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 1.1, 'reg_lambda': 4.3, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 1.1, 'reg_lambda': 4.5, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 1.1, 'reg_lambda': 4.3, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.9000000000000001}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 40, 'reg_alpha': 1.4000000000000001, 'reg_lambda': 4.800000000000001, 'verbose': -1}
cv mcc: 0.4745139018492157                                                                                             
******************************                                                                                         
{'class_weight': {0: 4.800000000000001}, 'max_depth': 25, 'n_estimators': 400, 'num_leaves': 48, 'reg_alpha': 1.8, 'reg_lambda': 4.6000000000000005, 'verbose': -1}
cv mcc: 0.34165434853327437                                                                                            
******************************                                                                                         
{'class_weight': {0: 9.4}, 'max_depth': 22, 'n_estimators': 100, 'num_leaves': 43, 'reg_alpha': 0.9, 'reg_lambda': 5.0, 'verbose': -1}
cv mcc: 0.21491116691474968                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.6}, 'max_depth': 21, 'n_estimators': 500, 'num_leaves': 39, 'reg_alpha': 0.0, 'reg_lambda': 4.2, 'verbose': -1}
cv mcc: 0.22679290940309427                                                                                            
******************************                                                                                         
{'class_weight': {0: 6.0}, 'max_depth': 17, 'n_estimators': 100, 'num_leaves': 24, 'reg_alpha': 0.4, 'reg_lambda': 3.9000000000000004, 'verbose': -1}
cv mcc: 0.299726708351451                                                                                              
******************************                                                                                         
{'class_weight': {0: 5.4}, 'max_depth': 27, 'n_estimators': 300, 'num_leaves': 46, 'reg_alpha': 1.4000000000000001, 'reg_lambda': 3.6, 'verbose': -1}
cv mcc: 0.32338661925125595                                                                                            
******************************                                                                                         
{'class_weight': {0: 3.1}, 'max_depth': 14, 'n_estimators': 700, 'num_leaves': 30, 'reg_alpha': 1.6, 'reg_lambda': 2.7, 'verbose': -1}
cv mcc: 0.4138918238488831                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.4}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 1.1, 'reg_lambda': 3.3000000000000003, 'verbose': -1}
cv mcc: 0.2541042722829792                                                                                             
******************************                                                                                         
{'class_weight': {0: 3.0}, 'max_depth': 26, 'n_estimators': 400, 'num_leaves': 33, 'reg_alpha': 0.5, 'reg_lambda': 4.3, 'verbose': -1}
cv mcc: 0.42378592393566866                                                                                            
******************************                                                                                         
{'class_weight': {0: 2.7}, 'max_depth': 12, 'n_estimators': 800, 'num_leaves': 38, 'reg_alpha': 1.9000000000000001, 'reg_lambda': 5.0, 'verbose': -1}
cv mcc: 0.4370866307871614                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.6000000000000005}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 1.1, 'reg_lambda': 4.4, 'verbose': -1}
cv mcc: 0.2519703121879471                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 25, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.9, 'reg_lambda': 3.6, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.1}, 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 41, 'reg_alpha': 0.9, 'reg_lambda': 3.5, 'verbose': -1}
cv mcc: 0.5208682226807307                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.300000000000001}, 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 26, 'reg_alpha': 0.9, 'reg_lambda': 3.4000000000000004, 'verbose': -1}
cv mcc: 0.2619773740005146                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.7}, 'max_depth': 10, 'n_estimators': 100, 'num_leaves': 44, 'reg_alpha': 0.2, 'reg_lambda': 3.0, 'verbose': -1}
cv mcc: 0.2539416040309949                                                                                             
******************************                                                                                         
{'class_weight': {0: 5.6000000000000005}, 'max_depth': 6, 'n_estimators': 500, 'num_leaves': 20, 'reg_alpha': 1.4000000000000001, 'reg_lambda': 3.7, 'verbose': -1}
cv mcc: 0.3164970991076627                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.0}, 'max_depth': 18, 'n_estimators': 100, 'num_leaves': 25, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 4.1000000000000005, 'verbose': -1}
cv mcc: 0.5242720364257746                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.2000000000000002}, 'max_depth': 18, 'n_estimators': 300, 'num_leaves': 36, 'reg_alpha': 0.7000000000000001, 'reg_lambda': 2.3000000000000003, 'verbose': -1}
cv mcc: 0.513839952160468                                                                                              
******************************                                                                                         
{'class_weight': {0: 1.0}, 'max_depth': 18, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.9, 'reg_lambda': 2.8000000000000003, 'verbose': -1}
cv mcc: 0.5242720364257746                                                                                             
******************************                                                                                         
{'class_weight': {0: 0.4}, 'max_depth': 7, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.8, 'reg_lambda': 2.4000000000000004, 'verbose': -1}
cv mcc: 0.5307806921338388                                                                                             
******************************                                                                                         
{'class_weight': {0: 8.8}, 'max_depth': 7, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 2.8000000000000003, 'verbose': -1}
cv mcc: 0.22629186769796703                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.0}, 'max_depth': 18, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.2, 'reg_lambda': 1.7000000000000002, 'verbose': -1}
cv mcc: 0.5242720364257746                                                                                             
******************************                                                                                         
{'class_weight': {0: 1.0}, 'max_depth': 18, 'n_estimators': 700, 'num_leaves': 45, 'reg_alpha': 0.1, 'reg_lambda': 1.8, 'verbose': -1}
cv mcc: 0.5242720364257746                                                                                             
******************************                                                                                         
{'class_weight': {0: 3.3000000000000003}, 'max_depth': 7, 'n_estimators': 700, 'num_leaves': 45, 'reg_alpha': 0.1, 'reg_lambda': 2.4000000000000004, 'verbose': -1}
cv mcc: 0.40764814520395026                                                                                            
******************************                                                                                         
{'class_weight': {0: 3.4000000000000004}, 'max_depth': 18, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 2.6, 'verbose': -1}
cv mcc: 0.4071974661301921                                                                                             
******************************                                                                                         
{'class_weight': {0: 9.200000000000001}, 'max_depth': 29, 'n_estimators': 700, 'num_leaves': 45, 'reg_alpha': 0.8, 'reg_lambda': 2.0, 'verbose': -1}
cv mcc: 0.20752629497613492                                                                                            
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 1.0, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.0, 'reg_lambda': 0.8, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.8, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.0, 'reg_lambda': 0.7000000000000001, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.0, 'reg_lambda': 0.2, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.2, 'reg_lambda': 0.30000000000000004, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 7.800000000000001}, 'max_depth': 9, 'n_estimators': 800, 'num_leaves': 27, 'reg_alpha': 0.0, 'reg_lambda': 0.2, 'verbose': -1}
cv mcc: 0.25065013359600624                                                                                            
******************************                                                                                         
{'class_weight': {0: 0.30000000000000004}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 22, 'reg_alpha': 0.5, 'reg_lambda': 1.0, 'verbose': -1}
cv mcc: 0.5251654627360786                                                                                             
******************************                                                                                         
{'class_weight': {0: 2.5}, 'max_depth': 9, 'n_estimators': 200, 'num_leaves': 27, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'verbose': -1}
cv mcc: 0.44438654209022976                                                                                            
******************************                                                                                         
{'class_weight': {0: 8.4}, 'max_depth': 24, 'n_estimators': 400, 'num_leaves': 42, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.6000000000000001, 'verbose': -1}
cv mcc: 0.2290130014077232                                                                                             
******************************                                                                                         
{'class_weight': {0: 0.8}, 'max_depth': 13, 'n_estimators': 700, 'num_leaves': 29, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.5, 'verbose': -1}
cv mcc: 0.532638084503226                                                                                              
******************************                                                                                         
{'class_weight': {0: 7.0}, 'max_depth': 11, 'n_estimators': 600, 'num_leaves': 29, 'reg_alpha': 0.4, 'reg_lambda': 1.1, 'verbose': -1}
cv mcc: 0.2624440856335148                                                                                             
******************************                                                                                         
{'class_weight': {0: 8.200000000000001}, 'max_depth': 19, 'n_estimators': 700, 'num_leaves': 27, 'reg_alpha': 0.0, 'reg_lambda': 0.8, 'verbose': -1}
cv mcc: 0.2336167335650732                                                                                             
******************************                                                                                         
{'class_weight': {0: 3.6}, 'max_depth': 8, 'n_estimators': 500, 'num_leaves': 49, 'reg_alpha': 0.2, 'reg_lambda': 0.0, 'verbose': -1}
cv mcc: 0.39326577154306736                                                                                            
******************************                                                                                         
{'class_weight': {0: 4.6000000000000005}, 'max_depth': 13, 'n_estimators': 300, 'num_leaves': 32, 'reg_alpha': 0.6000000000000001, 'reg_lambda': 0.4, 'verbose': -1}
cv mcc: 0.35171998062229637                                                                                            
******************************                                                                                         
{'class_weight': {0: 1.4000000000000001}, 'max_depth': 13, 'n_estimators': 700, 'num_leaves': 29, 'reg_alpha': 1.0, 'reg_lambda': 0.6000000000000001, 'verbose': -1}
cv mcc: 0.5014179934720157                                                                                             
******************************                                                                                         
{'class_weight': {0: 7.2}, 'max_depth': 13, 'n_estimators': 800, 'num_leaves': 29, 'reg_alpha': 0.5, 'reg_lambda': 0.30000000000000004, 'verbose': -1}
cv mcc: 0.2608135690281553                                                                                             
******************************                                                                                         
{'class_weight': {0: 8.0}, 'max_depth': 28, 'n_estimators': 200, 'num_leaves': 48, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'verbose': -1}
cv mcc: 0.24317330613276847                                                                                            
******************************                                                                                         
{'class_weight': {0: 6.1000000000000005}, 'max_depth': 9, 'n_estimators': 400, 'num_leaves': 47, 'reg_alpha': 0.2, 'reg_lambda': 1.3, 'verbose': -1}
cv mcc: 0.2948938644351275                                                                                             
100%|████████████████████████████████████████████████| 100/100 [04:44<00:00,  2.84s/it, best loss: 0.46736191549677397]
Total time : 284.57569 seconds
Best Parameters - val_loss:0.7934609258596275 for {'class_weight': {0: 0.8}, 'max_depth': 9, 'n_estimators': 700, 'num_leaves': 25, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 1.0, 'verbose': -1}
Out[44]:
LGBMClassifier(class_weight={0: 0.8}, max_depth=9, n_estimators=700,
               num_leaves=25, reg_alpha=0.30000000000000004, reg_lambda=1.0,
               verbose=-1)
In [45]:
models.append(("lgboost_tune", 
               make_pipeline(preprocessor,
                             lbm_model)))
models[-1]
Out[45]:
('lgboost_tune',
 Pipeline(steps=[('columntransformer',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('onehotencoder',
                                                   OneHotEncoder(),
                                                   ['country_code', 'domain',
                                                    'year', 'month', 'day',
                                                    'attr_1', 'attr_2',
                                                    'attr_3']),
                                                  ('minmaxscaler',
                                                   MinMaxScaler(),
                                                   ['subject_line_length',
                                                    'last_open_day',
                                                    'last_login_day',
                                                    'last_checkout_day',
                                                    'open_count_last_10_days',
                                                    'open_count_last...
                                                    'open_count_last_60_days',
                                                    'login_count_last_10_days',
                                                    'login_count_last_30_days',
                                                    'login_count_last_60_days',
                                                    'checkout_count_last_10_days',
                                                    'checkout_count_last_30_days',
                                                    'checkout_count_last_60_days',
                                                    'age'])])),
                 ('lgbmclassifier',
                  LGBMClassifier(class_weight={0: 0.8}, max_depth=9,
                                 n_estimators=700, num_leaves=25,
                                 reg_alpha=0.30000000000000004, reg_lambda=1.0,
                                 verbose=-1))]))
In [46]:
models[-1][1].fit(Xtrain, ytrain)
Out[46]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['country_code', 'domain',
                                                   'year', 'month', 'day',
                                                   'attr_1', 'attr_2',
                                                   'attr_3']),
                                                 ('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['subject_line_length',
                                                   'last_open_day',
                                                   'last_login_day',
                                                   'last_checkout_day',
                                                   'open_count_last_10_days',
                                                   'open_count_last...
                                                   'open_count_last_60_days',
                                                   'login_count_last_10_days',
                                                   'login_count_last_30_days',
                                                   'login_count_last_60_days',
                                                   'checkout_count_last_10_days',
                                                   'checkout_count_last_30_days',
                                                   'checkout_count_last_60_days',
                                                   'age'])])),
                ('lgbmclassifier',
                 LGBMClassifier(class_weight={0: 0.8}, max_depth=9,
                                n_estimators=700, num_leaves=25,
                                reg_alpha=0.30000000000000004, reg_lambda=1.0,
                                verbose=-1))])
In [47]:
models[-1][1].predict(Xvalid)
Out[47]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [48]:
matthews_corrcoef(yvalid,  models[-1][1].predict(Xvalid))
Out[48]:
0.5347237693736189

submit

In [49]:
testdf.head()
Out[49]:
country_code subject_line_length last_open_day last_login_day last_checkout_day open_count_last_10_days open_count_last_30_days open_count_last_60_days login_count_last_10_days login_count_last_30_days ... checkout_count_last_30_days checkout_count_last_60_days attr_1 attr_2 attr_3 age domain year month day
0 6 35 27 2 13 2 3 4 10 34 ... 6 18 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3
1 6 35 7 5 383 1 1 1 5 5 ... 0 0 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3
2 5 25 34 1 3 0 0 0 13 19 ... 2 2 1.0 1.0 2.0 33.0 @gmail.com 2019 9 3
3 1 36 63 5 5 0 0 0 43 110 ... 5 5 1.0 1.0 2.0 22.0 @yahoo.com 2019 9 3
4 5 23 6 5 54 0 0 0 4 12 ... 0 2 2.0 1.0 0.0 29.0 @gmail.com 2019 9 3

5 rows × 22 columns

In [50]:
predictions = models[-1][1].predict(testdf)
In [51]:
path = 'D:/USA 2021 Spring/Data_analysis/sample_submission_0_1.csv'
sampledf = pd.read_csv(path)
sampledf
Out[51]:
row_id open_flag
0 0 1
1 1 1
2 2 1
3 3 1
4 4 1
... ... ...
55965 55965 0
55966 55966 0
55967 55967 0
55968 55968 0
55969 55969 0

55970 rows × 2 columns

In [52]:
sampledf['open_flag'] = predictions
sampledf.to_csv('D:/USA 2021 Spring/Data_analysis/sample_sub_ShopeeEmail.csv', index = False)
sampledf
Out[52]:
row_id open_flag
0 0 0
1 1 0
2 2 0
3 3 0
4 4 0
... ... ...
55965 55965 0
55966 55966 0
55967 55967 0
55968 55968 0
55969 55969 1

55970 rows × 2 columns

In [53]:
fig = px.histogram(sampledf, x="open_flag")
fig.show()
In [54]:
sampledf['open_flag'].value_counts(normalize = True)
Out[54]:
0    0.894229
1    0.105771
Name: open_flag, dtype: float64